#!/usr/bin/env python
# -*- coding: utf-8 -*-
from nltk.corpus import stopwords
import sys

# Module-level filter list of high-frequency words; populated once by
# init_common_words() and consulted by main() when filtering tokens.
common_words = []

def init_common_words(input_file='/home/ivy/git/Social/Social/resource/filter.txt'):
    """Load the common-word filter list into the module global ``common_words``.

    Reads one word per line from *input_file*, strips surrounding whitespace,
    and stores the result in the global ``common_words``.

    :param input_file: path of the word list. Parameterized (the original
        hard-coded this path) with a default that preserves old behavior.
    :return: the loaded list (also stored in the global); callers that
        ignored the original ``None`` return are unaffected.
    """
    global common_words
    # Open read-only: the original used 'r+' but never writes the file.
    # Assigning (rather than appending) also prevents duplicate accumulation
    # if the function is ever called more than once.
    with open(input_file, 'r') as f:
        common_words = [line.strip() for line in f]
    return common_words

def main():
    """Streaming word-count mapper: read ``user\\ttext`` lines from stdin and
    print one ``user-word\\t1`` record per kept word.

    A word is kept when its lowercase form is not an NLTK English stopword,
    not in the common-word filter list, not a known noise token, not purely
    numeric, and longer than one character.
    """
    # Filter-list provenance:
    # http://www.wordfrequency.info/top5000.asp
    # awk '{if($3!="n" || $4>50000) print $2 }' 5000.txt > filter.txt
    init_common_words()

    # One set instead of three lists: membership tests go from O(n) per word
    # to O(1), which matters since this runs for every token on stdin.
    excluded = set(stopwords.words('english'))
    excluded.update(common_words)
    excluded.update(['http', 'co', 'rt', 'https', 'via', 'amp', 're',
                     'fuck', 'shit'])

    for line in sys.stdin:
        arr = line.strip().split('\t')
        if len(arr) < 2:
            # Malformed record (no tab-separated text field): skip rather
            # than crash with IndexError on arr[1].
            continue
        user, text = arr[0], arr[1]
        for word in text.split():
            lower = word.lower()  # hoisted: original recomputed this 3x
            if lower not in excluded and not word.isdigit() and len(word) > 1:
                # Single-argument print(...) is valid in both Python 2 and 3;
                # the original Py2-only print statement was not.
                print('%s-%s\t%s' % (user, lower, '1'))

# Script entry point: run as a streaming mapper (stdin in, stdout out).
if __name__ == '__main__':
    main()     
    
